# Description ------------------------------------------------------------------

### This script downloads, combines, and cleans the data exchange  
### information then merges it with the data in the hand-coded 
### market_month_data.csv, the hand-coded incentives data, and the R-cleaned
### mobile money data

# Settings ---------------------------------------------------------------------
data_path <- 'data/1_raw/'
clean_path <- 'data/2_clean/'

# Modifications to Downloaded Data:

## (NOTE: These actions have already been completed in the version of the 
## Data Exchange/ folder in the replication archive.)

# The "MAY 18" sheet of the "Lilongwe Revenue DATA for MAY 2018.xlsx" file MUST
# be moved to be the first sheet for this script to run correctly

# "December 2018/balaka.xls" MUST be renamed to "December 2018/Balaka Dec 2018.xls"
# "Treatment Assignment" header MUST be added to "Balaka Dec 2018.xls" and "Balaka Nov 19.xls"
# documents (Nov 19 is a typo, but is inconsequential for this script)

# Replace "Mdoni" Market in "Mzimba DATA March 2018.xls" with "Monolo" (this was
# a DAI data entry error)

# Replace Nathenje's (Lilongwe) revenue value in May with 454,520 (value is 
# 4,545,200 because of a district council staff data entry error)

# Packages ---------------------------------------------------------------------
# if packages are not installed, they must be installed with:
# install.packages("package_name"); The name of package must be in quotes
library(dplyr)
library(readr)
library(stringr)
library(purrr)
library(readxl)
library(haven)
library(labelled)
library(hms)

# Necessary Functions ----------------------------------------------------------
source("scripts/0_functions/functions_cleaning.R")
source("scripts/0_functions/functions_data_exchange.R")


################################################################################
# Revenue Information ----------------------------------------------------------
################################################################################

# Reading in, Combining All Market-level Data Exchange Workbooks----------------
path <- paste0(data_path, "DAIDataExchange")
#finding all the files that have to be read, excluding non-data exchange ones
files_to_read <- dir(path, full.names = T, recursive = T)

#combining all data
revenue_data <- map_dfr(files_to_read, .f = format_data)
## NOTE: This can take considerable time, as the formatting for the files
## was inconsistent, and the function imports files until the format seems 
## reasonable

# Cleaning Revenue Information -------------------------------------------------
#making sure that District is correct for all markets
revenue_data <- revenue_data %>% group_by(`Market Name`) %>%  
  mutate(District = Mode(District)) %>% ungroup()

#Fixing different spellings of M'mbelwa
revenue_data <- revenue_data %>% 
  mutate(District = if_else(District == "Mmbelwa" | District == "MMBELWA",
                            "M'mbelwa", District))

#Making sure that all market names match their name in rest of documents we have
#especially IPA surveys. We do this by merging in list of alternative names
#with corresponding "official" names
revenue_data <- revenue_data %>% 
  left_join(read_csv(paste0(data_path,"alt_names.csv")), 
            by = c("Market Name" = "alt_names"))

#creating unique year, month, market ID
revenue_data <- revenue_data %>% 
  mutate(yr_mnth_mkt = paste(year, 
                             ifelse(nchar(match(month, month.name)) < 2, 
                                    paste0("0", match(month, month.name)),
                                    match(month, month.name)), 
                             str_replace_all(official_name, " ", "_"), 
                             sep = "_"))
#then put it out in front
revenue_data <- select(revenue_data, yr_mnth_mkt, year, month, District,
                       official_name, everything()) %>% 
#and arrange data according to market and then ID
  arrange(official_name, yr_mnth_mkt)

#removing rows with the same unique ID (because some workbooks were uploaded 2x)
duplicates <- revenue_data$yr_mnth_mkt[duplicated(revenue_data$yr_mnth_mkt)]
revenue_data <- revenue_data[!duplicated(revenue_data$yr_mnth_mkt), ]

#renaming most important columns so that it is easier to work with them
revenue_data <- revenue_data %>% 
  rename(market_name_old = `Market Name`,
         treatment_status = `Treatment Assignment`,
         market_fees_collected = `Total market fees received this month (amount in MK)`,
         num_fee_collectors = `Number of fee collectors (#)`,
         num_receipts = `Total number of receipts returned this month (#)`,
         fc_mm_rc_salaries = `Amount spent on fee collector / market manager / revenue collector salaries this month for this market (amount in MK)`,
         cleaners_expenditures = `Amount spent paying cleaners this month (amount in MK)`,
         security_expenditures = `Amount spent paying security this month (amount in MK)`,
         trash_col_expenditures = `Amount spent paying for trash collection this month (amount in MK)`,
         utilities_expenditures = `Amount spent on utilities (water, electricity) this month (amount in MK)`,
         non_market_expenditure = `Amount of market revenues spent on non-market uses (amount in MK)`,
         other_expenditures = `Other market spending this month (amount in MK)`,
         vendor_counts_mkt_day = `Vendor count on market days (#)`,
         vendor_counts_nonmkt_day = `Vendor count on non-market days (#)`,
         revenue_target = `Revenue target given (amount in MK or write "NA" if didn't get a revenue target)`,
         SMS_date = `Date SMS message sent last month (date or write "NA" if wasn't sent)`,
         SMS_message = `Content of SMS message (text)`,
         BU_components = `What pieces of the bottom-up intervention were carried out by LGAP in this market in the past month? (text)`,
         BU_components_fail = `In the past month, did this market fail to receive any part of the bottom-up intervention that it was supposed to receive? If so please describe what happened and why.`,
         BU_compliance_issue = `In the past month, did this market receive any part of the bottom-up intervention that it was not supposed to receive? PLEASE REPORT ANY ISSUES, EVEN IF LGAP WAS NOT THE ONE WHO CARRIED OUT THE INTERVENTION OR ACTIVITY.`,
         TD_components = `What pieces of the top-down intervention were carried out by LGAP in this market in the past month? If so please describe.`,
         TD_components_fail = `In the past month, did this market fail to receive any part of the top-down intervention that it was supposed to receive?`,
         TD_compliance_issues = `In the past month, did this market receive any part of the top-down intervention that it was not supposed to receive? PLEASE REPORT ANY ISSUES, EVEN IF LGAP WAS NOT THE ONE WHO CARRIED OUT THE INTERVENTION OR ACTIVITY.`,
         other_LGAP_act = `What other LGAP activities were carried out within 10 KM of this market?`,
         nonLGAP_act_in_mkt = `What non-LGAP projects are you aware of that took place in this market in the past month? (For example other NGO, donor, or government projects)`,
         nonLGAP_act_10km = `What non-LGAP projects are you aware of that took place within 10 km of this market in the past month?`,
         protest = `Did vendors in this market make any protests or complaints in the past month?`,
         protest_descr = `If yes, please describe:`,
         other_issues = `Is there anything else we should know about what happened in this market or the surrounding area in the past month?`
         )

# turning numeric cols back into numeric
# Note that this turns potentially useful messages into NAs
# Therefore we retain the old version as well
revenue_data <- revenue_data %>% 
  mutate(market_fees_collected_cl = as.numeric(market_fees_collected),
         num_fee_collectors_cl = as.numeric(num_fee_collectors))

# creating indicator vecs for payment issues
revenue_data <- revenue_data %>% 
  mutate(fc_delay_pay = as.numeric(grepl("not paid|last", fc_mm_rc_salaries)),
         fc_commission = as.numeric(grepl("commission", fc_mm_rc_salaries)))

#fix missing treatment status from Chilobwe from November 2017
revenue_data[revenue_data$yr_mnth_mkt == "2017_11_Chilobwe" ,
             "treatment_status"] <- "BU"

# creating treatment indicator vecs
revenue_data <- revenue_data %>%
  mutate(TD_treat = as.numeric(grepl("TD|Both|All", treatment_status)),
         BU_treat = as.numeric(grepl("BU|Both|All", treatment_status)),
         BU = ifelse(BU_treat == 1 & TD_treat == 0,
                     1, 0),
         TD = ifelse(BU_treat == 0 & TD_treat == 1,
                     1, 0),
         Both = ifelse(BU_treat == 1 & TD_treat == 1,
                       1, 0))

#renaming treatment status
revenue_data <- revenue_data %>% 
  rename( "treatment_status_DE" = "treatment_status")

#create month_year factor variable
revenue_data <- revenue_data %>% 
  mutate(month_year = factor(paste(month, year),
                             levels = c("November 2017", "December 2017", 
                                        "January 2018",
                                        "February 2018", "March 2018", 
                                        "April 2018", "May 2018", "June 2018",
                                        "July 2018", "August 2018",
                                        "September 2018", "October 2018", 
                                        "November 2018", "December 2018")))

#making year a numeric variable
revenue_data$year <- as.numeric(revenue_data$year)

#make a variable where all 0s are treated as NAs
revenue_data <- revenue_data %>% 
  mutate(market_fees_collected_cl_no0s = ifelse(market_fees_collected_cl == 0, 
                                                NA, market_fees_collected_cl))

#fix spikes in January and February 2018
#by treating as NA
revenue_data <- revenue_data %>% 
  mutate(market_fees_collected_cl_no0s_fix1 = ifelse(market_fees_collected_cl_no0s > 11000000,
                                                     NA,
                                                     market_fees_collected_cl_no0s))
revenue_data$market_fees_collected_cl_no0s_fix1[revenue_data$yr_mnth_mkt ==
                                                  "2017_11_Manyamula"] <- NA

#by "fixing" with most likely value (extra 1 in front for Jenda and Mitundu, 
#extra 0 for Manyamula)
revenue_data <- revenue_data %>% 
  mutate(market_fees_collected_cl_no0s_fix2 = market_fees_collected_cl_no0s)
revenue_data$market_fees_collected_cl_no0s_fix2[revenue_data$yr_mnth_mkt ==
                                                  "2018_01_Mitundu"] <- 1810650
revenue_data$market_fees_collected_cl_no0s_fix2[revenue_data$yr_mnth_mkt ==
                                                  "2018_02_Jenda_Market"] <- 1346000
revenue_data$market_fees_collected_cl_no0s_fix2[revenue_data$yr_mnth_mkt ==
                                                  "2017_11_Manyamula"] <- 40000
#by "fixing" with most likely value (extra 1 in front for Mitundu, extra zeros for
#Jenda and Manyamula)
revenue_data <- revenue_data %>% 
  mutate(market_fees_collected_cl_no0s_fix3 = market_fees_collected_cl_no0s_fix2)
revenue_data$market_fees_collected_cl_no0s_fix3[revenue_data$yr_mnth_mkt ==
                                                  "2018_02_Jenda_Market"] <- 1134600

#create logged versions of market revenue variables
revenue_data <- revenue_data %>% 
  mutate(market_fees_collected_cl_log = log(market_fees_collected_cl + 1),
         #we add 1 to retain 0s
         market_fees_collected_cl_no0s_log = log(market_fees_collected_cl_no0s))

################################################################################
# Monthly Market Information Merge ---------------------------------------------
################################################################################
# load in .csv file with market_month data
market_month_data <- read_csv(paste0(data_path,"market_month_data.csv"))

# merge this with data exchange workbook data
market_month <- full_join(revenue_data, market_month_data, 
                          by = c( "yr_mnth_mkt" = "yr_mnth_mkt",
                                  "year" = "year",
                                  "month" = "month",
                                  "official_name" = "official_name",
                                  "District" = "District"))

#make treatment_status easier to read
market_month <- market_month %>% 
  mutate(treatment_status = ifelse(grepl("Top", treatment_status), "TD",
                                   ifelse(grepl("Bottom", treatment_status), "BU",
                                          ifelse(grepl("Control", treatment_status), "Control", 
                                                 ifelse(grepl("Both", treatment_status), 
                                                        "BOTH", treatment_status)))))

# create revenue collected variable normalized by market fee
# so variable is market fee units collected in market
market_month <- market_month %>% 
  mutate(market_fees_collected_cl_st = market_fees_collected_cl/market_fee,
         market_fees_collected_cl_no0s_st = market_fees_collected_cl_no0s/market_fee,
         market_fees_collected_cl_st_log = log((market_fees_collected_cl + 1)/market_fee),
         market_fees_collected_cl_no0s_st_log = log(market_fees_collected_cl_no0s_st),
         market_fees_collected_cl_no0s_fix1_st = market_fees_collected_cl_no0s_fix1/market_fee,
         market_fees_collected_cl_no0s_fix2_st = market_fees_collected_cl_no0s_fix2/market_fee,
         market_fees_collected_cl_no0s_fix3_st = market_fees_collected_cl_no0s_fix3/market_fee)

#create revenue collected variable per fee collector
market_month <- market_month %>% 
  mutate(market_fees_collected_cl_fc = ifelse(num_fee_collectors_cl > 0,
                                              market_fees_collected_cl/num_fee_collectors_cl,
                                              NA),
         market_fees_collected_cl_fc_log = ifelse(num_fee_collectors_cl > 0,
                                              log((market_fees_collected_cl + 1)/num_fee_collectors_cl),
                                              NA),
         market_fees_collected_cl_no0s_fc = ifelse(num_fee_collectors_cl > 0,
                                               market_fees_collected_cl_no0s/num_fee_collectors_cl,
                                               NA),
         market_fees_collected_cl_no0s_fix1_fc = ifelse(num_fee_collectors_cl > 0,
                                                   market_fees_collected_cl_no0s_fix1/num_fee_collectors_cl,
                                                   NA),
         market_fees_collected_cl_no0s_fix2_fc = ifelse(num_fee_collectors_cl > 0,
                                                   market_fees_collected_cl_no0s_fix2/num_fee_collectors_cl,
                                                   NA),
         market_fees_collected_cl_no0s_fix3_fc = ifelse(num_fee_collectors_cl > 0,
                                                   market_fees_collected_cl_no0s_fix3/num_fee_collectors_cl,
                                                   NA),
         market_fees_collected_cl_no0s_fc_log = log(market_fees_collected_cl_no0s_fc),
         market_fees_collected_cl_st_fc = ifelse(num_fee_collectors_cl > 0,
                                                 market_fees_collected_cl_st/num_fee_collectors_cl,
                                                 NA),
         market_fees_collected_cl_st_fc_log = ifelse(num_fee_collectors_cl > 0,
                                                 log((market_fees_collected_cl_st + 1)/num_fee_collectors_cl),
                                                 NA),
         market_fees_collected_cl_no0s_st_fc = ifelse(num_fee_collectors_cl > 0,
                                                      market_fees_collected_cl_no0s_st/num_fee_collectors_cl,
                                                      NA),
         market_fees_collected_cl_no0s_fix1_st_fc = ifelse(num_fee_collectors_cl > 0,
                                                      market_fees_collected_cl_no0s_fix1_st/num_fee_collectors_cl,
                                                      NA),
         market_fees_collected_cl_no0s_fix2_st_fc = ifelse(num_fee_collectors_cl > 0,
                                                      market_fees_collected_cl_no0s_fix2_st/num_fee_collectors_cl,
                                                      NA),
         market_fees_collected_cl_no0s_fix2_st_fc = ifelse(num_fee_collectors_cl > 0,
                                                      market_fees_collected_cl_no0s_fix3_st/num_fee_collectors_cl,
                                                      NA),
         market_fees_collected_cl_no0s_st_fc_log = log(market_fees_collected_cl_no0s_st_fc))


#fix vendor counting issue problem (markets marked as having issue for Nov/Dec
#2018, when that wasn't supposed to be active)
market_month <- market_month %>% 
  mutate(vendor_count_issue = ifelse(treatment_status %in% c("TD", "Both") &
                                       month %in% c("November", "December") &
                                       year == 2018, -1, vendor_count_issue))

#create grievance_resp_none and grievance_resp_some
market_month <- market_month %>% 
  mutate(grievance_resp_none = 1*(grievance_mgs_received > 0 &
                                    grievance_resp == 0),
         grievance_resp_some = 1*(grievance_resp < grievance_mgs_received))

################################################################################
# Block ID Merge ---------------------------------------------------------------
################################################################################

#merge in treatment assignment block ids
market_month <- left_join(market_month, 
                          read_dta(paste0(data_path, "block_ids.dta")) %>% 
                            mutate_if(is.labelled, to_factor),
                          by = c("official_name" = "market", "District" = "district"))

################################################################################
# Incentives Information Merge -------------------------------------------------
################################################################################

# load in .xlsx file with hand coded incentives data
incentives <- readxl::read_xlsx("data/1_raw/incentives_check_data.xlsx")

#cutting out unnecessary duplicated variables
incentives <- select(incentives, -(year:month_year)) 

#making met_targets_LGAP_init variable
incentives <- incentives %>% 
  mutate(met_target_LGAP_init = if_else(met_target_incentives_folder_upper == "YES",
                                        1,
                                        0))

# merge incentives into market_month
market_month <- full_join(market_month, incentives, 
                          by = c( "yr_mnth_mkt" = "yr_mnth_mkt"))

################################################################################
# Aggregated Mobile Money Data Merge -------------------------------------------
################################################################################
load(paste0(clean_path, "mobile_money.RData"))

#drop January data (not necessary in market_month)
mobile_mon_monthly <- filter(mobile_mon_monthly, month_str != "January")

#drop unnecessary + duplicated variables
mobile_mon_monthly <- mobile_mon_monthly %>% 
  select(yr_mnth_mkt, mob_mon_transf, mob_mon_trnsctns, Market_Number)

market_month <- full_join(market_month, mobile_mon_monthly,
                          by = "yr_mnth_mkt")

#make global mobile_mon_active variable
market_month <- market_month %>% 
  mutate(mob_mon_active = ifelse(treatment_status %in% c("TD", "Both"),
                                 ifelse(mob_mon_trnsctns == 0 |
                                          is.na(mob_mon_trnsctns), 0, 1),
                                 NA),
         mob_mon_active_all = ifelse(is.na(mob_mon_active),
                                     0, mob_mon_active))

# Saving -----------------------------------------------------------------------
save(market_month, file = "data/2_clean/month_data_merged.RData")

# End --------------------------------------------------------------------------






